This notebook is all about benchmarking some R code used in this package.
Hardware / Software used:
- Intel i7-4600U
- Compilation flags for C/C++:
-O2 -Wall $(DEBUGFLAG) -mtune=core2 (R’s defaults)
- Windows Server 2012 R2
- R 3.3.2 + Intel MKL
Libraries
library(data.table)
data.table 1.10.4
The fastest way to learn (by data.table authors): https://www.datacamp.com/courses/data-analysis-the-data-table-way
Documentation: ?data.table, example(data.table) and browseVignettes("data.table")
Release notes, videos and slides: http://r-datatable.com
library(microbenchmark)
library(Rcpp)
library(ggplot2)
library(plotly)
Attaching package: <U+393C><U+3E31>plotly<U+393C><U+3E32>
The following object is masked from <U+393C><U+3E31>package:ggplot2<U+393C><U+3E32>:
last_plot
The following object is masked from <U+393C><U+3E31>package:stats<U+393C><U+3E32>:
filter
The following object is masked from <U+393C><U+3E31>package:graphics<U+393C><U+3E32>:
layout
# Helper function to print data well in tables
print_well <- function(data, digits = 6) {
# To milliseconds
data <- data / 1000000
# Sprintf helper
sprintf_helper <- paste0("%.0", digits, "f")
cat("| Min | 25% | 50% | 75% | Max | Mean | \n| --: | --: | --: | --: | --: | --: | \n| ", sprintf(sprintf_helper, min(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.25)), " | ", sprintf(sprintf_helper, median(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.75)), " | ", sprintf(sprintf_helper, max(data)), " | ", sprintf(sprintf_helper, mean(data)), " | \n", sep = "")
return(data)
}
# Test case function
# Arguments renamed to avoid recursive clash
test_case <- function(f, preds, labels, eps) {
cat("Test case: ", do.call(f, list(preds = preds[1:50],
labels = labels[1:50],
eps = 1e-15)), " \n")
}
Benchmarking Clamped Vector to Logloss
For a 2-class vector of 1,000,000 observations:
- Vector A of length=(1000000)
- Vector B of length=(1000000) with 2 classes
A = [1, 2, 3, 4, ..., 1000000]
B = [0, 1, 1, 0, ...]
Get the following Vector C and D:
C = Clamped A by 1e-15
D = Mean of logloss(C, B)
Initialize data
# Generate random data
set.seed(11111)
data <- runif(1000000, 0, 1)
labels <- round(runif(1000000, 0, 1), digits = 0)
# How many digits for benchmarking in milliseconds
my_digits <- 6L
# How many runs for benchmarking?
my_runs <- 1000L
Benchmarks
# ===== BLOCK 1 =====
faster1 <- function(preds, labels, eps = 1e-15) {
x <- preds
x[x < eps] <- eps
x[x > (1 - eps)] <- 1 - eps
return(-mean(labels * log(x) + (1 - labels) * log(1 - x)))
}
test_case(faster1, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data1 <- print_well(microbenchmark(faster1(data, labels), times = my_runs)$time, digits = my_digits)
| 98.236812 |
102.558966 |
104.827241 |
109.353338 |
261.528689 |
110.746225 |
# ===== BLOCK 2 =====
faster2 <- function(preds, labels, eps = 1e-15) {
x <- pmin(pmax(preds, eps), 1 - eps)
return(-mean(labels * log(x) + (1 - labels) * log(1 - x)))
}
test_case(faster2, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data2 <- print_well(microbenchmark(faster2(data, labels), times = my_runs)$time, digits = my_digits)
| 100.001406 |
104.717001 |
108.238398 |
117.771084 |
210.083657 |
121.705744 |
# ===== BLOCK 3 =====
faster3 <- function(preds, labels, eps = 1e-15) {
x <- preds
x[x < eps] <- eps
x[x > (1 - eps)] <- 1 - eps
return(-1/length(labels) * (sum(labels * log(x) + (1 - labels) * log(1 - x))))
}
test_case(faster3, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data3 <- print_well(microbenchmark(faster3(data, labels), times = my_runs)$time, digits = my_digits)
| 96.428882 |
101.448587 |
103.667824 |
107.993494 |
199.984946 |
109.273259 |
# ===== BLOCK 4 =====
faster4 <- function(preds, labels, eps = 1e-15) {
x <- pmin(pmax(preds, eps), 1 - eps)
return(-1/length(labels) * (sum(labels * log(x) + (1 - labels) * log(1 - x))))
}
test_case(faster4, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data4 <- print_well(microbenchmark(faster4(data, labels), times = my_runs)$time, digits = my_digits)
| 98.875441 |
103.396217 |
106.065157 |
111.215343 |
191.767533 |
114.586109 |
# ===== BLOCK 5 =====
cppFunction("double faster5(NumericVector preds, NumericVector labels, double eps) {
NumericVector clamped = clamp(eps, preds, 1 - eps);
NumericVector loggy = -1 * ((labels * log(clamped) + (1 - labels) * log(1 - clamped)));
double logloss = mean(loggy);
return logloss;
}")
test_case(faster5, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data5 <- print_well(microbenchmark(faster5(data, labels, eps = 1e-15), times = my_runs)$time, digits = my_digits)
| 78.746066 |
80.047844 |
81.832206 |
87.750363 |
166.269487 |
84.050458 |
# ===== BLOCK 6 =====
cppFunction("double faster6(NumericVector preds, NumericVector labels, double eps) {
NumericVector clamped = clamp(eps, preds, 1 - eps);
NumericVector loggy = -1 * ((labels * log(clamped) + (1 - labels) * log(1 - clamped)));
double logloss = sum(loggy)/loggy.size();
return logloss;
}")
test_case(faster6, preds = data, labels = labels, eps = 1e-15)
Test case: 0.9837966
data6 <- print_well(microbenchmark(faster6(data, labels, eps = 1e-15), times = my_runs)$time, digits = my_digits)
| 77.733382 |
79.289472 |
81.178370 |
87.284316 |
165.628577 |
83.651944 |
Summary Results
data_time <- data.table(rbindlist(list(data.frame(Time = data1, Bench = "faster1"),
data.frame(Time = data2, Bench = "faster2"),
data.frame(Time = data3, Bench = "faster3"),
data.frame(Time = data4, Bench = "faster4"),
data.frame(Time = data5, Bench = "faster5"),
data.frame(Time = data6, Bench = "faster6"))))
data_time <- data_time[, t_mean := mean(Time), by = Bench]
data_time <- data_time[, t_median := median(Time), by = Bench]
data_time$Benchs <- data_time$Bench
levels(data_time$Benchs) <- paste0("faster", 1:6, "= [", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(min(Time)), by = Bench]$V1), ", ", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(max(Time)), by = Bench]$V1), "], mean=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(mean(Time)), by = Bench]$V1), ", median=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(median(Time)), by = Bench]$V1))
my_time <- data_time[, list(min(Time), quantile(Time, probs = 0.25), median(Time), quantile(Time, probs = 0.75), max(Time), mean(Time)), by = Bench]
colnames(my_time) <- c("Function", "Min", "25%", "50%", "75%", "Max", "Mean")
my_time <- my_time[order(Mean, decreasing = FALSE), ]
print(my_time, digits = 6)
Plot Results
ggplotly(ggplot(data = data_time, aes(x = Time)) + geom_histogram(aes(y = ..density..), bins = 20, color = "darkblue", fill = "lightblue") + facet_wrap(~ Benchs, ncol = 2) + geom_vline(aes(xintercept = t_mean), color = "blue", linetype = "dashed", size = 2) + geom_vline(aes(xintercept = t_median), color = "red", linetype = "dashed", size = 2) + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_histogram(aes(y = ..density..), bins = 100, position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_density(position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
---
title: "Benchmarks: Logloss"
output:
  html_notebook:
    collapsed: no
    theme: united
    toc: yes
    toc_depth: 6
    toc_float: yes

---

This notebook is all about benchmarking some R code used in this package.

Hardware / Software used:

* Intel i7-4600U
* Compilation flags for C/C++: `-O2 -Wall $(DEBUGFLAG) -mtune=core2` (R's defaults)
* Windows Server 2012 R2
* R 3.3.2 + Intel MKL

# Libraries

```{r init}
library(data.table)
library(microbenchmark)
library(Rcpp)
library(ggplot2)
library(plotly)
```

```{r based}

# Helper function to print data well in tables
print_well <- function(data, digits = 6) {
  
  # To milliseconds
  data <- data / 1000000
  
  # Sprintf helper
  sprintf_helper <- paste0("%.0", digits, "f")
  
  cat("| Min | 25% | 50% | 75% | Max | Mean |  \n| --: | --: | --: | --: | --: | --: |  \n| ", sprintf(sprintf_helper, min(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.25)), " | ", sprintf(sprintf_helper, median(data)), " | ", sprintf(sprintf_helper, quantile(data, probs = 0.75)), " | ", sprintf(sprintf_helper, max(data)), " | ", sprintf(sprintf_helper, mean(data)), " |  \n", sep = "")
  
  return(data)
  
}

# Test case function
# Arguments renamed to avoid recursive clash
test_case <- function(f, preds, labels, eps) {
  cat("Test case: ", do.call(f, list(preds = preds[1:50],
                                     labels = labels[1:50],
                                     eps = 1e-15)), "  \n")
}

```

# Benchmarking Clamped Vector to Logloss

For a 2-class vector of 1,000,000 observations:

* Vector A of length=(1000000)
* Vector B of length=(1000000) with 2 classes

```
A = [1, 2, 3, 4, ..., 1000000]
B = [0, 1, 1, 0, ...]
```

Get the following Vector C and D:

```
C = Clamped A by 1e-15
D = Mean of logloss(C, B)
```

# Initialize data

```{r bench1}
# Generate random data
set.seed(11111)
data <- runif(1000000, 0, 1)
labels <- round(runif(1000000, 0, 1), digits = 0)

# How many digits for benchmarking in milliseconds
my_digits <- 6L

# How many runs for benchmarking?
my_runs <- 1000L
```

# Benchmarks

```{r bench2, results="asis"}

# ===== BLOCK 1 =====
faster1 <- function(preds, labels, eps = 1e-15) {
  x <- preds
  x[x < eps] <- eps
  x[x > (1 - eps)] <- 1 - eps
  return(-mean(labels * log(x) + (1 - labels) * log(1 - x)))
}
test_case(faster1, preds = data, labels = labels, eps = 1e-15)
data1 <- print_well(microbenchmark(faster1(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 2 =====
faster2 <- function(preds, labels, eps = 1e-15) {
  x <- pmin(pmax(preds, eps), 1 - eps)
  return(-mean(labels * log(x) + (1 - labels) * log(1 - x)))
}
test_case(faster2, preds = data, labels = labels, eps = 1e-15)
data2 <- print_well(microbenchmark(faster2(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 3 =====
faster3 <- function(preds, labels, eps = 1e-15) {
  x <- preds
  x[x < eps] <- eps
  x[x > (1 - eps)] <- 1 - eps
  return(-1/length(labels) * (sum(labels * log(x) + (1 - labels) * log(1 - x))))
}
test_case(faster3, preds = data, labels = labels, eps = 1e-15)
data3 <- print_well(microbenchmark(faster3(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 4 =====
faster4 <- function(preds, labels, eps = 1e-15) {
  x <- pmin(pmax(preds, eps), 1 - eps)
  return(-1/length(labels) * (sum(labels * log(x) + (1 - labels) * log(1 - x))))
}
test_case(faster4, preds = data, labels = labels, eps = 1e-15)
data4 <- print_well(microbenchmark(faster4(data, labels), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 5 =====
cppFunction("double faster5(NumericVector preds, NumericVector labels, double eps) {
  NumericVector clamped = clamp(eps, preds, 1 - eps);
  NumericVector loggy = -1 * ((labels * log(clamped) + (1 - labels) * log(1 - clamped)));
  double logloss = mean(loggy);
  return logloss;
}")
test_case(faster5, preds = data, labels = labels, eps = 1e-15)
data5 <- print_well(microbenchmark(faster5(data, labels, eps = 1e-15), times = my_runs)$time, digits = my_digits)

# ===== BLOCK 6 =====
cppFunction("double faster6(NumericVector preds, NumericVector labels, double eps) {
  NumericVector clamped = clamp(eps, preds, 1 - eps);
  NumericVector loggy = -1 * ((labels * log(clamped) + (1 - labels) * log(1 - clamped)));
  double logloss = sum(loggy)/loggy.size();
  return logloss;
}")
test_case(faster6, preds = data, labels = labels, eps = 1e-15)
data6 <- print_well(microbenchmark(faster6(data, labels, eps = 1e-15), times = my_runs)$time, digits = my_digits)
```

# Summary Results

```{r bench3}

data_time <- data.table(rbindlist(list(data.frame(Time = data1, Bench = "faster1"),
                                       data.frame(Time = data2, Bench = "faster2"),
                                       data.frame(Time = data3, Bench = "faster3"),
                                       data.frame(Time = data4, Bench = "faster4"),
                                       data.frame(Time = data5, Bench = "faster5"),
                                       data.frame(Time = data6, Bench = "faster6"))))
data_time <- data_time[, t_mean := mean(Time), by = Bench]
data_time <- data_time[, t_median := median(Time), by = Bench]
data_time$Benchs <- data_time$Bench 
levels(data_time$Benchs) <- paste0("faster", 1:6, "= [", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(min(Time)), by = Bench]$V1), ", ", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(max(Time)), by = Bench]$V1), "], mean=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(mean(Time)), by = Bench]$V1), ", median=", sprintf(paste0("%.0", my_digits, "f"), data_time[, list(median(Time)), by = Bench]$V1))

my_time <- data_time[, list(min(Time), quantile(Time, probs = 0.25), median(Time), quantile(Time, probs = 0.75), max(Time), mean(Time)), by = Bench]
colnames(my_time) <- c("Function", "Min", "25%", "50%", "75%", "Max", "Mean")
my_time <- my_time[order(Mean, decreasing = FALSE), ]
print(my_time, digits = 6)

```

# Plot Results

```{r bench4, fig.height=9, fig.width=10}

ggplotly(ggplot(data = data_time, aes(x = Time)) + geom_histogram(aes(y = ..density..), bins = 20, color = "darkblue", fill = "lightblue") + facet_wrap(~ Benchs, ncol = 2) + geom_vline(aes(xintercept = t_mean), color = "blue", linetype = "dashed", size = 2) + geom_vline(aes(xintercept = t_median), color = "red", linetype = "dashed", size = 2) + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())

```

```{r bench5, fig.height=6, fig.width=10}
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_histogram(aes(y = ..density..), bins = 100, position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
```

```{r bench6, fig.height=6, fig.width=10}
ggplotly(ggplot(data = data_time[, .(Time, Bench)], aes(x = Time, y = ..count.., fill = Bench)) + geom_density(position = "fill") + labs(x = "Time (Milliseconds)", y = "Density") + theme_bw())
```

